Install if needed
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("clusterProfiler")
# BiocManager::install("msigdbr")
# BiocManager::install("org.Rn.eg.db")
# BiocManager::install("enrichplot")
# BiocManager::install("pathview")
Load Packages
library(tidyverse) # for dplyr, ggplot, stringr, readr etc
library(clusterProfiler) #GSEA includes gsea, enrichplot etc.
library(org.Rn.eg.db) # Rat genome annotation package - incl. AnnotationDbi
library(msigdbr) # MSigDB gene sets
library(enrichplot) # for dotplot()
library(ggridges) # for ridgeplot()
library(pathview) # pathview() - Kegg pathway visualisation
# Set working directory to source file location
# read in protein list df from MSstatsTMT comparison
df <- read_csv(
"input/protein_lists/2025_09_12_rp_1pspg_test_pairwise_msstats_v3.1.5_t8.csv")
## New names:
## Rows: 7746 Columns: 9
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (3): Protein, Label, issue dbl (6): ...1, log2FC, SE, DF, pvalue, adj.pvalue
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# lil checks
head(df) # peak
## # A tibble: 6 × 9
## ...1 Protein Label log2FC SE DF pvalue adj.pvalue issue
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1 A0A096MJ11 EE vs SH 0.0145 0.0369 32.3 0.697 0.934 <NA>
## 2 2 A0A096MJ32 EE vs SH -0.354 0.151 12 0.0365 0.536 <NA>
## 3 3 A0A096MJ78 EE vs SH -0.0986 0.0317 57.7 0.00288 0.306 <NA>
## 4 4 A0A096MJ94 EE vs SH -0.0598 0.182 12 0.749 0.950 <NA>
## 5 5 A0A096MJ99 EE vs SH -0.0497 0.0329 51.4 0.138 0.696 <NA>
## 6 6 A0A096MJA9 EE vs SH 0.0167 0.0590 46.4 0.778 0.955 <NA>
str(df) # structure | 7746 x 9
## spc_tbl_ [7,746 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ...1 : num [1:7746] 1 2 3 4 5 6 7 8 9 10 ...
## $ Protein : chr [1:7746] "A0A096MJ11" "A0A096MJ32" "A0A096MJ78" "A0A096MJ94" ...
## $ Label : chr [1:7746] "EE vs SH" "EE vs SH" "EE vs SH" "EE vs SH" ...
## $ log2FC : num [1:7746] 0.0145 -0.3543 -0.0986 -0.0598 -0.0497 ...
## $ SE : num [1:7746] 0.0369 0.1505 0.0317 0.1825 0.0329 ...
## $ DF : num [1:7746] 32.3 12 57.7 12 51.4 ...
## $ pvalue : num [1:7746] 0.69717 0.0365 0.00288 0.74896 0.13751 ...
## $ adj.pvalue: num [1:7746] 0.934 0.536 0.306 0.95 0.696 ...
## $ issue : chr [1:7746] NA NA NA NA ...
## - attr(*, "spec")=
## .. cols(
## .. ...1 = col_double(),
## .. Protein = col_character(),
## .. Label = col_character(),
## .. log2FC = col_double(),
## .. SE = col_double(),
## .. DF = col_double(),
## .. pvalue = col_double(),
## .. adj.pvalue = col_double(),
## .. issue = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
colnames(df) # column names
## [1] "...1" "Protein" "Label" "log2FC" "SE"
## [6] "DF" "pvalue" "adj.pvalue" "issue"
# lil checks (bit messy)
unique(df$issue) # if = na in issue column then good. if not exclude
## [1] NA "oneConditionMissing"
df %>% filter(str_detect(Protein, "Cont_")) # rows with contaminants - to exlude
## # A tibble: 21 × 9
## ...1 Protein Label log2FC SE DF pvalue adj.pvalue issue
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1875 Cont_O77727 EE vs SH 0.112 0.208 12 0.598 0.904 <NA>
## 2 1876 Cont_P00761 EE vs SH -0.0316 0.0486 51.3 0.519 0.882 <NA>
## 3 1877 Cont_P00883 EE vs SH -0.0477 0.0545 12 0.398 0.845 <NA>
## 4 1878 Cont_P02769 EE vs SH -0.102 0.134 54.5 0.449 0.864 <NA>
## 5 1879 Cont_P04264 EE vs SH -0.336 0.213 51.4 0.121 0.682 <NA>
## 6 1880 Cont_P09870 EE vs SH 0.0783 0.173 12 0.659 0.921 <NA>
## 7 1881 Cont_P13645 EE vs SH -0.386 0.180 53.6 0.0369 0.538 <NA>
## 8 1882 Cont_P19012 EE vs SH -0.0800 0.108 12 0.474 0.871 <NA>
## 9 1883 Cont_P30879 EE vs SH 0.0168 0.0297 40.0 0.576 0.897 <NA>
## 10 1884 Cont_P34955 EE vs SH -0.00468 0.0900 12 0.959 0.990 <NA>
## # ℹ 11 more rows
df %>% filter(str_detect(log2FC, "Inf")) # rows with Infinity /no log fold - to exclude
## # A tibble: 1 × 9
## ...1 Protein Label log2FC SE DF pvalue adj.pvalue issue
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 134 A0A0G2JYB7 EE vs SH Inf NA NA NA NA oneConditionMi…
df %>% filter(is.na(pvalue)) # same observation as the "Inf" in log2FC
## # A tibble: 1 × 9
## ...1 Protein Label log2FC SE DF pvalue adj.pvalue issue
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 134 A0A0G2JYB7 EE vs SH Inf NA NA NA NA oneConditionMi…
df %>% filter(is.na(adj.pvalue)) # same observation as the "Inf" in log2FC
## # A tibble: 1 × 9
## ...1 Protein Label log2FC SE DF pvalue adj.pvalue issue
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 134 A0A0G2JYB7 EE vs SH Inf NA NA NA NA oneConditionMi…
protein_comp <- df %>%
filter(is.na(issue)) %>% # keeps rows with issue == na (so comparisons that are ok)
filter(!str_detect(Protein, "Cont_")) %>% # remove contaminant proteins
filter(!str_detect(log2FC, "Inf")) %>% # remove infinity/missing fold change values
dplyr::select(Protein:adj.pvalue) # rm weird ...1 col & issue col, skip this?
# check
head(protein_comp)
## # A tibble: 6 × 7
## Protein Label log2FC SE DF pvalue adj.pvalue
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A0A096MJ11 EE vs SH 0.0145 0.0369 32.3 0.697 0.934
## 2 A0A096MJ32 EE vs SH -0.354 0.151 12 0.0365 0.536
## 3 A0A096MJ78 EE vs SH -0.0986 0.0317 57.7 0.00288 0.306
## 4 A0A096MJ94 EE vs SH -0.0598 0.182 12 0.749 0.950
## 5 A0A096MJ99 EE vs SH -0.0497 0.0329 51.4 0.138 0.696
## 6 A0A096MJA9 EE vs SH 0.0167 0.0590 46.4 0.778 0.955
str(protein_comp) # 7724 x 7
## tibble [7,724 × 7] (S3: tbl_df/tbl/data.frame)
## $ Protein : chr [1:7724] "A0A096MJ11" "A0A096MJ32" "A0A096MJ78" "A0A096MJ94" ...
## $ Label : chr [1:7724] "EE vs SH" "EE vs SH" "EE vs SH" "EE vs SH" ...
## $ log2FC : num [1:7724] 0.0145 -0.3543 -0.0986 -0.0598 -0.0497 ...
## $ SE : num [1:7724] 0.0369 0.1505 0.0317 0.1825 0.0329 ...
## $ DF : num [1:7724] 32.3 12 57.7 12 51.4 ...
## $ pvalue : num [1:7724] 0.69717 0.0365 0.00288 0.74896 0.13751 ...
## $ adj.pvalue: num [1:7724] 0.934 0.536 0.306 0.95 0.696 ...
colnames(protein_comp)
## [1] "Protein" "Label" "log2FC" "SE" "DF"
## [6] "pvalue" "adj.pvalue"
Convert UniProt Accessions into EntrezID using bitr()
Note Entrez id is a central gene identifier - so want to map uniprot accession with gene ID which then can be mapped to other IDs.
keytypes(org.Rn.eg.db) # key types available
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
## [11] "GENETYPE" "GO" "GOALL" "IPI" "ONTOLOGY"
## [16] "ONTOLOGYALL" "PATH" "PFAM" "PMID" "PROSITE"
## [21] "REFSEQ" "SYMBOL" "UNIPROT"
# 7158/7724 mapped (8.05% missing)
entrezID <- bitr(protein_comp$Protein, # input df
fromType = "UNIPROT", # input type
toType = "ENTREZID", # output type
OrgDb = org.Rn.eg.db) # annotation db
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(protein_comp$Protein, fromType = "UNIPROT", toType =
## "ENTREZID", : 8.05% of input gene IDs are fail to map...
# UNIPROT -> ENTREZID - 8.05% of input gene failed to map
# UNIPROT -> ENSEMBL 12.91% of input gene IDs are fail to map...
# UNIPROT -> REFSEQ - 8.05% of input gene IDs are fail to map...
# UNIPROT -> SYMBOL - 8.05% of input gene IDs are fail to map...
Merge EntrezID onto protein list
protein_genes <- merge(entrezID, protein_comp,
by.x = "UNIPROT",
by.y = "Protein")
head(protein_genes)
## UNIPROT ENTREZID Label log2FC SE DF pvalue
## 1 A0A096MJ11 315084 EE vs SH 0.01450424 0.03693997 32.25947 0.697165337
## 2 A0A096MJ32 302612 EE vs SH -0.35425998 0.15054311 12.00000 0.036498920
## 3 A0A096MJ78 308478 EE vs SH -0.09856990 0.03166410 57.67314 0.002881095
## 4 A0A096MJ94 362426 EE vs SH -0.05975939 0.18248997 12.00000 0.748957106
## 5 A0A096MJ99 364033 EE vs SH -0.04965287 0.03291150 51.35170 0.137509534
## 6 A0A096MJA9 312981 EE vs SH 0.01668589 0.05896103 46.41957 0.778436130
## adj.pvalue
## 1 0.9336827
## 2 0.5364025
## 3 0.3056723
## 4 0.9497256
## 5 0.6963620
## 6 0.9545115
dim(protein_genes)
## [1] 7158 8
#write out?
Prepare Ranked Gene List
# use sign(df$log2fc)*(-log10(df$pval)) + BSS & PNNL
# using
geneList_df <- protein_genes %>%
mutate(ranking = sign(protein_genes$log2FC)*(-log10(protein_genes$pvalue))) %>%
group_by(ENTREZID) %>%
summarise(ranking = mean(ranking, na.rm = TRUE)) %>%
arrange(-ranking)
# can export
dim(geneList_df) # 7153 x 2 - same as prev test!
## [1] 7153 2
# pvalue & log2FC min = -7.6191 max 4.559
# adj.pvalue * log2FC = max 1.3938 & min -3.7302
# ~ ~ ~ ~ ~ ~ ~
# convert into a named vector to supply to fgsea()
geneList <- tibble::deframe(geneList_df)
head(geneList)
## 288080 689954 29237 191595 299282 683788
## 4.559848 4.437793 4.129179 3.828775 3.765140 3.762137
tail(geneList)
## 360814 25379 363016 361663 114488 29637
## -4.306857 -4.335003 -4.474002 -4.556742 -4.753525 -7.619261
plot(geneList)
sum(is.na(geneList)) # 0 | without extra line (not sure why previous version had NA)
## [1] 0
species & orthologs - has human & mouse but can rat orthologs & use.
TO CHECK/DECIDE - Using mouse or human genesets?
species = "Mus musculus" if you have
db_species = "MM" ?Pathway enrichment analysis (uses enricher() from clusterProfiler or fgsea())
12 Universal enrichment analysis (ClusterProfiler)
Gene Set Enrichment Analysis with ClusterProfiler (NYU NGS)
# # msigdbr helper functions
# msigdbr_species() # 20 species
# msigdbr_collections() # 33730584 bytes?
# unique(msigdbr_df$db_version) # check ver
# citation("msigdbr") # how to cite
# Preps for fgsea()
# # example
# # Get CGP (chemical and genetic perturbations) gene sets with genes mapped to rat
# orthologs gs <- msigdbr(species = "Rattus norvegicus",
# collection = "C2",
# subcollection = "CGP")
# pull in hallmark genesets from msigdbr, maps rat orthologs to human geneset
hallmark_gs_df <- msigdbr(species = "Rattus norvegicus", # my species
collection = "H") # Hallmark collection
head(hallmark_gs_df)
## # A tibble: 6 × 23
## gene_symbol ncbi_gene ensembl_gene db_gene_symbol db_ncbi_gene db_ensembl_gene
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Abca1 313210 ENSRNOG0000… ABCA1 19 ENSG00000165029
## 2 Abcb8 362302 ENSRNOG0000… ABCB8 11194 ENSG00000197150
## 3 Acaa2 170465 ENSRNOG0000… ACAA2 10449 ENSG00000167315
## 4 Acadl 25287 ENSRNOG0000… ACADL 33 ENSG00000115361
## 5 Acadm 24158 ENSRNOG0000… ACADM 34 ENSG00000117054
## 6 Acads 64304 ENSRNOG0000… ACADS 35 ENSG00000122971
## # ℹ 17 more variables: source_gene <chr>, gs_id <chr>, gs_name <chr>,
## # gs_collection <chr>, gs_subcollection <chr>, gs_collection_name <chr>,
## # gs_description <chr>, gs_source_species <chr>, gs_pmid <chr>,
## # gs_geoid <chr>, gs_exact_source <chr>, gs_url <chr>, db_version <chr>,
## # db_target_species <chr>, ortholog_taxon_id <int>, ortholog_sources <chr>,
## # num_ortholog_sources <dbl>
dim(hallmark_gs_df) # 7294 x 23
## [1] 7294 23
# ~ ~ ~ ~ ~ ~ ~
# Convert in 2 name list format required for fgsea()
# Splits in separate lists containing Entrez grouped by the geneset name (gs_name)
# ENTREZID is ncbi_gene?
hallmark_genesets <- hallmark_gs_df %>%
split(x = .$ncbi_gene, # containing - df/vector
f = .$gs_name) # file - factor
#str(hallmark_genesets) # list of 50
# Preps TERM2GENE for GSEA() | 1st col = term_id/geneset & 2nd col = mapped gene
msig_h <- msigdbr(species = "Rattus norvegicus",
collection = "H") %>%
dplyr::select(gs_name, ncbi_gene) # get only geneset name & corresponding ENTREZID
head(msig_h)
## # A tibble: 6 × 2
## gs_name ncbi_gene
## <chr> <chr>
## 1 HALLMARK_ADIPOGENESIS 313210
## 2 HALLMARK_ADIPOGENESIS 362302
## 3 HALLMARK_ADIPOGENESIS 170465
## 4 HALLMARK_ADIPOGENESIS 25287
## 5 HALLMARK_ADIPOGENESIS 24158
## 6 HALLMARK_ADIPOGENESIS 64304
dim(msig_h) # 7294 x 2
## [1] 7294 2
Running gsea with GSEA()
# using GSEA() from clusterProfiler
# uses fgseaMultilevel() function? | auto
GSEA_hallmark <- GSEA(geneList = geneList, # ranked gene list
TERM2GENE = msig_h, # genesets
by = 'fgsea' # method - fgsea or DOSE
)
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.73% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some of the pathways the P-values were likely overestimated. For
## such pathways log2err is set to NA.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
# Output has lots of extra info! results + genesets, Genelist, input params! -> save rda
# can make into tibble for easy viewing
GSEA_hallmark_tibble <- as_tibble(GSEA_hallmark)
Plots
enrichplot::dotplot(GSEA_hallmark,
showCategory = 15,
title = "Housing - EE vs. SH (t8 - t2)")
dotplot(GSEA_hallmark,
split = ".sign", # split by sign - positive vs negative enrichment score
title = "Housing - EE vs. SH (t8 - t2) - split by sign",
font.size = 11) +
facet_grid(.~.sign) # split plots by sign
GSEA plot using gseaplot() from enrichplot
# GSEA plot - positive - most sig [1] - Hallmark_oxidative_phosphorylation
gseaplot(GSEA_hallmark,
geneSetID = 1, # 1st geneset is 1
by = "all", # running score or position
title = GSEA_hallmark$Description[1])
# GSEA plot - positive - most sig [2] - Hallmark_MYC_targets_v1
gseaplot(GSEA_hallmark,
geneSetID = 2, # 2nd geneset is 2
by = "all", # running score or position
title = GSEA_hallmark$Description[2])
Ridgeplot
ridgeplot(GSEA_hallmark) + labs(x = "enrichment distribution")
## Picking joint bandwidth of 0.236
# Preps TERM2GENE for GSEA() | 1st col = term_id/geneset & 2nd col = mapped gene
msig_C5_CC <- msigdbr(species = "Rattus norvegicus",
collection = "C5", # C5 - ontology set
subcollection = "CC") %>% # GO cellular component
dplyr::select(gs_name, ncbi_gene)
dim(msig_C5_CC) # 98011 x 2 for C5 CC!!!
## [1] 98011 2
# using GSEA() from clusterProfiler
# uses fgseaMultilevel() function? | auto
GSEA_msig_C5_CC <- GSEA(geneList = geneList, # ranked gene list
TERM2GENE = msig_C5_CC, # genesets
#organism = "rno",
by = 'fgsea') # method - fgsea or DOSE
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.73% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : There were 9 pathways for which P-values were not calculated
## properly due to unbalanced (positive and negative) gene-level statistic values.
## For such pathways pval, padj, NES, log2err are set to NA. You can try to
## increase the value of the argument nPermSimple (for example set it nPermSimple
## = 10000)
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some of the pathways the P-values were likely overestimated. For
## such pathways log2err is set to NA.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
# had to restart R to fix error | worked after - see warnings.
# Output has lots of extra info! results + genesets, Genelist, input params! -> save rda
# can make into tibble for easy viewing
GSEA_msig_C5_CC_tibble <- as_tibble(GSEA_msig_C5_CC)
# 102 significant beyond adj pvalue 0.05
# neuronal & synaptic ones activated in EE. phew!
dotplot(GSEA_msig_C5_CC,
split = ".sign", # split by pos vs neg enrichment score
showCategory = 102,
font.size = 11,
label_format = 55,
title = "Housing - EE vs. SH (t8 - t2) - GSEA_msig_C5_CC") +
facet_grid(.~.sign) # split plots by sign
# simplify
sim_msig_go_cc <- pairwise_termsim(GSEA_msig_C5_CC)
gseGO_cc <- gseGO(geneList = geneList,
ont ="CC", # cell component
keyType = "ENTREZID",
OrgDb = org.Rn.eg.db, # if you unset it uses bg based on provide geneList?
by = "fgsea",
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 0.05,
verbose = TRUE,
pAdjustMethod = "BH")
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.73% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : There were 16 pathways for which P-values were not calculated
## properly due to unbalanced (positive and negative) gene-level statistic values.
## For such pathways pval, padj, NES, log2err are set to NA. You can try to
## increase the value of the argument nPermSimple (for example set it nPermSimple
## = 10000)
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some of the pathways the P-values were likely overestimated. For
## such pathways log2err is set to NA.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
# can make into tibble for easy viewing
gseGO_cc_tibble <- as_tibble(gseGO_cc)
# 102 significant beyond adj pvalue 0.05
enrichplot::dotplot(gseGO_cc,
split = ".sign", # split by + vs - enrirchment score
showCategory = 102,
font.size = 11,
label_format = 55,
title = "Housing - EE vs. SH (t8 - t2) - gseGO_CC"
) +
facet_grid(.~.sign) # split plots by sign
# 6.6 Visualize enriched GO terms as a directed acyclic graph
# goplot requires ggarchery | plot induced GO DAG of significant terms
goplot(gseGO_cc,
showCategory = 25, # default 10
layout = "sugiyama") # default
## Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Gene-Concept Network with cnetplot()
# 15.4 Heatmap-like functional classification
#15.5 Tree plot
# The treeplot() function performs hierarchical clustering of enriched terms.
# Relies on the pairwise similarities of the enriched terms calc by the pairwise_termsim() func, which by default using Jaccard’s similarity index (JC).
gseGO_cc_pwts <- pairwise_termsim(gseGO_cc)
treeplot(gseGO_cc_pwts,
hclust_method = "average", # see hclust () documentation on methods
showCategory = 70, # default 30
nCluster = 10, # default 5 | parent term changes a lot & weird assign at times
)
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(method = your_value)' instead of 'hclust_method'.
## The hclust_method parameter will be removed in the next version.
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(n = your_value)' instead of 'nCluster'.
## The nCluster parameter will be removed in the next version.
# works but unstable assignment - look into
# 15.6 Enrichment Map
# also use gseGO_cc_pwts <- pairwise_termsim(gseGO_cc) from above
emapplot(gseGO_cc_pwts,
showCategory = 30, # default 30
# cex_category = 1.5, # can be used to resize nodes | not in ?emapplot?
# layout = "kk" # change layout | not in ?emapplot? nvm old doesn't accept?
)
# simplfy
sim_gseGO_cc <- simplify(gseGO_cc_pwts,
cutoff=0.7,
by="p.adjust",
select_fun=min)
# plot simplified
emapplot(sim_gseGO_cc)
gseGO_mf <- gseGO(geneList = geneList,
ont ="MF", # molecular function
keyType = "ENTREZID",
OrgDb = org.Rn.eg.db, # if you unset it uses bg based on provide geneList?
by = "fgsea",
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 0.05,
verbose = TRUE,
pAdjustMethod = "BH")
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.73% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : There were 4 pathways for which P-values were not calculated
## properly due to unbalanced (positive and negative) gene-level statistic values.
## For such pathways pval, padj, NES, log2err are set to NA. You can try to
## increase the value of the argument nPermSimple (for example set it nPermSimple
## = 10000)
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some of the pathways the P-values were likely overestimated. For
## such pathways log2err is set to NA.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
# can make into tibble for easy viewing
gseGO_mf_tibble <- as_tibble(gseGO_mf)
# 63 significant beyond adj pvalue 0.05
enrichplot::dotplot(gseGO_mf,
split = ".sign", # split by + vs - enrirchment score
showCategory = 65,
font.size = 11,
label_format = 55,
title = "Housing - EE vs. SH (t8 - t2) - gseGO_mf"
) +
facet_grid(.~.sign) # split plots by sign
#15.5 Tree plot
gseGO_mf_pwts <- pairwise_termsim(gseGO_mf)
treeplot(gseGO_mf_pwts,
hclust_method = "average", # see hclust () documentation on methods
showCategory = 65, # default 30
nCluster = 20) # default 5 | parent term changes a lot & weird assign at times
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(method = your_value)' instead of 'hclust_method'.
## The hclust_method parameter will be removed in the next version.
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(n = your_value)' instead of 'nCluster'.
## The nCluster parameter will be removed in the next version.
# works but unstable assignment - look into
sim_gseGO_mf <- simplify(gseGO_mf_pwts,
cutoff=0.7,
by="p.adjust",
select_fun=min)
# plot simplified
emapplot(sim_gseGO_mf)
goplot(gseGO_mf,
showCategory = 25, # default 10
layout = "sugiyama") # default
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
gseGO_bp <- gseGO(geneList = geneList,
ont ="BP", # Biological process
keyType = "ENTREZID",
OrgDb = org.Rn.eg.db, # if you unset it uses bg based on provide geneList?
by = "fgsea",
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 0.05,
verbose = TRUE,
pAdjustMethod = "BH")
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.73% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : There were 18 pathways for which P-values were not calculated
## properly due to unbalanced (positive and negative) gene-level statistic values.
## For such pathways pval, padj, NES, log2err are set to NA. You can try to
## increase the value of the argument nPermSimple (for example set it nPermSimple
## = 10000)
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some of the pathways the P-values were likely overestimated. For
## such pathways log2err is set to NA.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
# can make into tibble for easy viewing
gseGO_bp_tibble <- as_tibble(gseGO_bp)
# 176 significant beyond adj pvalue 0.05
enrichplot::dotplot(gseGO_bp,
split = ".sign", # split by + vs - enrirchment score
showCategory = 176,
font.size = 11,
label_format = 80,
title = "Housing - EE vs. SH (t8 - t2) - gseGO_bp"
) +
facet_grid(.~.sign) # split plots by sign
#15.5 Tree plot
gseGO_bp_pwts <- pairwise_termsim(gseGO_bp)
treeplot(gseGO_bp_pwts,
hclust_method = "average", # see hclust () documentation on methods
showCategory = 176, # default 30
nCluster = 20, # default 5 | parent term changes a lot & weird assign at times
)
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(method = your_value)' instead of 'hclust_method'.
## The hclust_method parameter will be removed in the next version.
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(n = your_value)' instead of 'nCluster'.
## The nCluster parameter will be removed in the next version.
# works but unstable assignment - look into
sim_gseGO_bp <- simplify(gseGO_bp_pwts,
cutoff=0.7,
by="p.adjust",
select_fun=min)
# plot simplified
emapplot(sim_gseGO_cc)
~ ~ ~ ~ ~ ~ ~ ~ ~
# look at rat info kegg
rat <- search_kegg_organism('rno', by='kegg_code')
head(rat)
## kegg_code scientific_name common_name
## 33 rno Rattus norvegicus rat
gseKEGG_r <- gseKEGG(geneList = geneList,
organism = 'rno', #RAT
keyType = "kegg",
minGSSize = 10, # min gene set size
maxGSSize = 500, # max gene set size
pvalueCutoff = 0.05, # pvalue cutoff
pAdjustMethod = 'BH', #BejaminiHoc Correciton
verbose = TRUE) # print message or not
## Reading KEGG annotation online: "https://rest.kegg.jp/link/rno/pathway"...
## Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/rno"...
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.73% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : There were 2 pathways for which P-values were not calculated
## properly due to unbalanced (positive and negative) gene-level statistic values.
## For such pathways pval, padj, NES, log2err are set to NA. You can try to
## increase the value of the argument nPermSimple (for example set it nPermSimple
## = 10000)
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some of the pathways the P-values were likely overestimated. For
## such pathways log2err is set to NA.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
gse_kegg_tibble <- as_tibble(gseKEGG_r) # 26 sig or 23?? (0.05)| 32 if adjp = 0.1
enrichplot::dotplot(gseKEGG_r,
split = ".sign", # split by + vs - enrirchment score
showCategory = 30,
title = "Housing - EE vs. SH (t8 - t2) - gseKEGG_r") +
facet_grid(.~.sign) # split plot by seign
Glutamatergic synapse - Rattus norvegicus (rat)
# view browseKEGG from clusterProfiler
# launches to web when function run!
# browseKEGG(gseKEGG_r,
# pathID = 'rno04724') # glutamtergic synapse rat
KEGG - Organismal Systems > Nervous System (10 pathways)
# via pathview::pathview()
# rno04724 <- pathview(gene.data = geneList,
# pathway.id = 'rno04724', # Glutamatergic synapse - Rat
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
# writes out png & xml files to source file location
# look into how to plot & also save as PDF
# rno04723 - Retrograde endocannabinoid signaling
# rno04723 <- pathview(gene.data = geneList,
# pathway.id = 'rno04723',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
# gives error ???
# rno05016 - Huntington disease | runs okay!
# rno05016 <- pathview(gene.data = geneList,
# pathway.id = 'rno05016',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
# rno04722 - neurotrophin_signaling
# rno04722 <- pathview(gene.data = geneList,
# pathway.id = 'rno04722',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
# rno04720 - Long Term Potentiation (LTP)
# rno04720 <- pathview(gene.data = geneList,
# pathway.id = 'rno04720',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
# rno05012 - Parkinsons
# rno05012 <- pathview(gene.data = geneList,
# pathway.id = 'rno05012',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
# rno04082 - Neuroactive ligand signaling | nice holistic view of many neuroligand paths
# rno04082 <- pathview(gene.data = geneList,
# pathway.id = 'rno04082',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
# rno05033 - Nicotine Addiction | not much just ampa & gaba
# rno05033 <- pathview(gene.data = geneList,
# pathway.id = 'rno05033',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
~ ~ ~
# rno04721 - Synaptic_vesicle_cycle | most up regulated in EE
# rno04721 <- pathview(gene.data = geneList,
# pathway.id = 'rno04721',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
# rno04727 - GABAergic_synapse_
# rno04727 <- pathview(gene.data = geneList,
# pathway.id = 'rno04727',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
# rno04728 - Dompaminergic_synapse_
# rno04728 <- pathview(gene.data = geneList,
# pathway.id = 'rno04728',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
# rno04726 - Serotonergic_synapse_
# rno04726 <- pathview(gene.data = geneList,
# pathway.id = 'rno04726',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
# rno04725 - Cholinergic_synapse_ | Interesting seems 2 be mediating synaptic plasticity
# rno04725 <- pathview(gene.data = geneList,
# pathway.id = 'rno04725',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
# rno04730 - LTD_ | cerebellum pathview? | otherwise interesting
# rno04730 <- pathview(gene.data = geneList,
# pathway.id = 'rno04730',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
Environmental Information Processing > Signal Transduction
# rno04010 - MAPK_signaling_ | Interesting - cell proliferation & differentiation
# rno04010 <- pathview(gene.data = geneList,
# pathway.id = 'rno04010',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
# rno04014 - Ras_signaling_
# rno04014 <- pathview(gene.data = geneList,
# pathway.id = 'rno04014',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
Most significant Pathways
1 - Oxidative Phosphorylation (rno00190) - Metabolism >Energy Metabolism > Ox Pho
# rno00190 - Oxidative_phosphorylation_ | several highly upregulated
# rno00190 <- pathview(gene.data = geneList,
# pathway.id = 'rno00190',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
2 rno05415 Diabetic_cardiomyopathy - Human Diseases; Cardiovascular Disease
# rno05415 - Diabetic_cardiomyopathy_ | upreg @ mitocondrial - what does this mean?
# rno05415 <- pathview(gene.data = geneList,
# pathway.id = 'rno05415',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
3 - Non-alcoholic fatty liver disease rno04932 - Endocrine & metabolic disease
# # rno04932 - Non-alcoholic_fatty_liver_d_ | driven by cx1,3,4 mito resp chain
# rno04932 <- pathview(gene.data = geneList,
# pathway.id = 'rno04932',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
4 - Thermogenesis_ (rno04714) - adipose tissue & keep warm | fat SH rats & skinny EE
# rno04714 - Thermogenesis_ | mitochondrial + others - look into
# rno04714 <- pathview(gene.data = geneList,
# pathway.id = 'rno04714',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
9 - Pathways of neurodegeneration - multiple diseases (rno05022) (neurodegen disea
# rno05022 - Pathways_of_neurodegeneration_multi_ |interesting/confusing
# rno05022 <- pathview(gene.data = geneList,
# pathway.id = 'rno05022',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
Not enriched but interesting
# #rno04081 - Hormone_signaling_ | various guanine nucleotide binding pro & PKA down
# # and only SST (somatostatin) & Opioid (Penk) Up | also only brain specific up
# rno04081 <- pathview(gene.data = geneList,
# pathway.id = 'rno04081',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
# rno04080 - Neuroactive_ligand_receptor_interaction_ | not many but interesting - look
# rno04080 <- pathview(gene.data = geneList,
# pathway.id = 'rno04080',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
# rno04060 - Cytokine_cytokine_receptor_interaction_ | no up or down
# rno04060 <- pathview(gene.data = geneList,
# pathway.id = 'rno04060',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
# rno04512 - ECM_receptor_interaction_ | all down
# rno04512 <- pathview(gene.data = geneList,
# pathway.id = 'rno04512',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
# rno04514 - Cell_adhesion_molecules_ | no up or down except CXADR down
# rno04514 <- pathview(gene.data = geneList,
# pathway.id = 'rno04514',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
# rno04360 - Axon_guidance_ | most down except Ras
# rno04360 <- pathview(gene.data = geneList,
# pathway.id = 'rno04360',
# species = 'rno',
# limit = list(gene=max(abs(geneList)), cpd=1))
~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
gseMKEGG_r <- gseMKEGG(geneList = geneList,
organism = 'rno',
pvalueCutoff = 1)
## Reading KEGG annotation online: "https://rest.kegg.jp/link/rno/module"...
## Reading KEGG annotation online: "https://rest.kegg.jp/list/module"...
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.73% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## leading edge analysis...
## done...
## --> Not a KEGG enrichment result
head(gseMKEGG_r)
## ID
## M00146 M00146
## M00158 M00158
## M00154 M00154
## M00009 M00009
## M00147 M00147
## M00011 M00011
## Description
## M00146 NADH dehydrogenase (ubiquinone) 1 alpha subcomplex
## M00158 F-type ATPase, eukaryotes
## M00154 Cytochrome c oxidase
## M00009 Citrate cycle (TCA cycle, Krebs cycle)
## M00147 NADH dehydrogenase (ubiquinone) 1 beta subcomplex
## M00011 Citrate cycle, second carbon oxidation, 2-oxoglutarate => oxaloacetate
## setSize enrichmentScore NES pvalue p.adjust qvalue
## M00146 12 0.8917876 2.424023 6.069907e-07 1.031884e-05 4.472563e-06
## M00158 14 0.8231009 2.324812 8.226993e-06 6.992944e-05 3.030998e-05
## M00154 12 0.8503936 2.311508 1.236428e-05 7.006423e-05 3.036840e-05
## M00009 21 0.7081324 2.200668 6.558425e-05 2.787330e-04 1.208131e-04
## M00147 10 0.8376506 2.172511 1.404557e-04 4.775495e-04 2.069874e-04
## M00011 13 0.7771340 2.168138 1.833869e-04 5.195963e-04 2.252120e-04
## rank leading_edge
## M00146 697 tags=83%, list=10%, signal=75%
## M00158 902 tags=71%, list=13%, signal=63%
## M00154 595 tags=83%, list=8%, signal=77%
## M00009 641 tags=57%, list=9%, signal=52%
## M00147 1054 tags=90%, list=15%, signal=77%
## M00011 641 tags=54%, list=9%, signal=49%
## core_enrichment
## M00146 296658/293453/301123/678759/299739/362440/315167/100911483/681024/291660
## M00158 116550/171374/65262/26196/192241/300677/641434/171375/140608/245965
## M00154 94194/120103152/303393/26198/54322/29507/252934/298762/29445/25282
## M00009 94173/360975/170587/79250/299201/25179/114597/114096/289217/24368/361071/298942
## M00147 689938/301427/361385/293991/681418/299310/293130/100912357/297990
## M00011 360975/299201/114597/289217/24368/361071/298942
gseMKEGG_r_tibble <- as_tibble(gseMKEGG_r) # 17 sig @ adj-pval .05? | why spec as 1 abv?
# im confused - loot at gseMKEGG function later
enrichplot::dotplot(gseMKEGG_r,
split = ".sign", # split by + vs - enrirchment score
showCategory = 30,
title = "Housing - EE vs. SH (t8 - t2) - gseMKEGG_r") +
facet_grid(.~.sign) # split plot by seign